/* Figure 1: generate a list of hot majors and get examples of superstar firms */

libname edu '!userprofile\\Dropbox\Education\Replication\Data';

data skew (keep = year major skew1e_major tsmean5_major tsvol5_major);
  set edu.skewnew;
  if major ~= .;
  if year ~= .;
  if skew1e_major = . then delete; 
  if year < 1956 then delete;
  if major = 12 then delete;
run;

proc sort data = skew; by year major; run;

* calculate residual skewness;
proc surveyreg data = skew;
  class 
  year 
  ;
  model skew1e_major = 
  tsmean5_major
  tsvol5_major 
  year
  /solution adjrsq;
  output out = r_skew r = r_skew;
run;

proc sql;
  create table year as
  select distinct year 
  from edu.skewnew
  where 
  1960<= year <= 2020;
quit;

proc sql;
  create table skew2 as
  select a.year, b.major, b.year as year2, b.r_skew as skew1e_major
  from year as a, r_skew as b
  where 0<= a.year - b.year <= 4;
quit;

proc sort data = skew2; by year major year2; run;

* average across the past 5 years;
proc means data = skew2 noprint;
  by year major;
  var skew1e_major;
  output out = skew4
  mean = skew1e_mix;
run;

data skew4; set skew4; if _FREQ_ < 5 then delete; run;

proc sort data = skew4; by year descending skew1e_mix major; run;

data skew4;
  set skew4;
  by year;
  retain n;
  if first.year then do; n = 0; num = 0; end;
  num = n + 1;
  n = num;
run;

data skew5;
  set skew4;
  by year;
  if num = 1; * pick the highest;
run;

proc sql;
  create table hotmajor as
  select a.year, a.major, a.skew1e_mix, b.majortitle
  from skew5 as a, edu.nsf_majors as b
  where a.major = b.major;
quit;

proc means data = hotmajor noprint;
  var skew1e_mix;
  output out = hotmajor_med median = med_skew;
run;

proc sql;
  create table hotmajor as
  select a.*, b.med_skew
  from hotmajor as a, hotmajor_med as b;
quit;

data hotmajor; set hotmajor; hot = 0; if skew1e_mix > med_skew then hot = 1; run;

data edu.hotmajor; set hotmajor; run;
data edu.hotmajorall; set skew4; run;

/*get a list of superstar firms.  First read all the crsp monthly data with nonmissing return*/
DATA crsp;  
set edu.crsp_datanew;
IF not missing (RET);
if RET ne 'C';
if RET ne 'B';
year=year(date);
month=month(date);
if 1<=EXCHCD<=3 and SHRCD in (10 11);
naics_3=compress(substr(naics,1,3));
sic_2=floor(siccd/100);
mktcap = abs(prc)*shrout;
run;

/*annualize monthly returns*/
proc sql noprint;
create table annualReturns as
select *,
  count(ret) as nbMonths,
  exp(sum(log(1+ret))) as cumulativeReturn
from crsp
group by permno, year;
select * from annualReturns;
quit;

proc sort data = annualreturns; by permno year month; run;

data annualreturns;
set annualreturns;
by permno year month;
if last.year;
if  nbmonths>=6;
ann_ret=cumulativeReturn-1;
run;

data annualreturns;
  set annualreturns;
  by permno year;
  lagmktcap = lag(mktcap);
  if first.permno then lagmktcap = .;
run;

/* get # employees, if there are duplicate entries, choose the larger and the most recent */
data emp (keep = lpermno datadate year emp); set edu.crspcompustat_datanew; year = year(datadate); run; 

proc sort data = emp; by lpermno year descending datadate descending emp; run;
proc sort data = emp nodupkey; by lpermno year; run;

proc sql;
  create table annualreturns as
  select distinct a.*, b.emp
  from annualreturns as a left join emp as b
  on a.permno = b.lpermno and a.year = b.year;
quit;

/* for each permco, only the permno that has the largest market cap will be counted (get rid of multiple share classes) */
proc sort data = annualreturns;
  by permco year descending mktcap;
run;

proc means data = annualreturns noprint;
  by permco year;
  var lagmktcap;
  output out = size sum = lagmktcap;
run;

data annualreturns;
  set annualreturns;
  by permco year;
  if first.year;
run;

proc sql;
  create table annualreturns as
  select a.*, b.lagmktcap
  from annualreturns as a, size as b
  where a.permco = b.permco and a.year = b.year;
quit;

/* map SIC2 and NAICS3 into major code */
proc sql;
  create table temp as
  select a.*, b.major 
  from annualreturns as a left join edu.sic_2_major as b
  on a.sic_2 = b.sic_2;
quit;

data temp;
  set temp;
  if year >= 2005 then delete;
run;

data temp2; set annualreturns; naics3 = input(naics_3, 12.); run;

proc sql;
  create table temp2 as
  select a.*, b.major 
  from temp2 as a left join edu.naics_3_major as b
  on a.naics3 = b.naics_3;
quit;

data temp2 (drop = naics3);
  set temp2;
  if year >= 2005;
run;

data annualreturns_major; set temp; run;
proc append base = annualreturns_major data = temp2; run;

/* calculate major-level skewness */
proc sql;
  create table major as
  select a.year, a.major, b.ann_ret, b.permno, year(b.date) as year2, b.emp
  from skew5 as a, annualreturns_major as b
  where a.major = b.major and 0<= a.year - year(b.date)<= 4;
quit;

data major; set major; if emp = . then delete; run;

proc sort data = major;
  by year major permno;
run;

data major;
  set major;
  cumret = log(1+ann_ret);
run;

proc means data = major noprint;
  by year major permno;
  var cumret;
  output out = major2 sum = cumret;
run;

data major2; set major2; fiveyear_ret = exp(cumret) - 1; if _FREQ_ < 5 then delete; run;

proc sql;
  create table major3 as
  select a.*, b.emp
  from major2 as a, major as b
  where a.permno = b.permno and a.year - b.year2 = 4; 
 quit;

proc sort data = major3; by year major; run;

proc univariate data = major3 vardef = df noprint;
by year major;
var fiveyear_ret;
weight emp;
output out=major4 skew=skew1e;
run;

/* identify superstar firm by calculating skewness again by dropping one firm */
* create a big dataset that has all possible firms;
proc sql;
  create table majorall as
  select a.year, a.major, a.permno, b.permno as permno2, b.fiveyear_ret as fiveyear_ret2
  from major2 as a, major2 as b
  where a.year = b.year and a.major = b.major;
quit;

data majorall; set majorall; if permno = permno2 then delete; run;

proc sql;
  create table majorall as
  select a.*, b.emp as emp2
  from majorall as a, major as b
  where a.permno2 = b.permno and a.year - b.year2 = 4; 
 quit;

proc sort data = majorall; by year major permno; run;

proc univariate data = majorall vardef = df noprint;
by year major permno;
var fiveyear_ret2;
weight emp2;
output out=majorall2 skew=skew1e;
run;

proc sql;
  create table majorall4 as
  select a.*, b.skew1e as trueskew
  from majorall2 as a, major4 as b
  where a.year = b.year and a.major = b.major;
quit;

data majorall4;
  set majorall4;
  drop = trueskew - skew1e;
run;

proc sort data = majorall4; by year major descending drop; run;

data superstar;
  set majorall4;
  retain oldcount;
  by year major;
  if first.major then do; 
  oldcount = 0;
  count = 0;
  end;
  count = oldcount + 1;
  oldcount = count;
run;

data superstar (drop = oldcount);
  set superstar;
  if count <= 3; 
run;

proc sql;
  create table superstar as
  select distinct a.*, b.comnam
  from superstar as a left join edu.crsp_names as b
  on a.permno = b.permno and a.year = year(b.date) and month(b.date) = 12;
quit;

proc sort data = superstar; by year count; run;
